setwd("/Volumes/project/bioinformatics/JLee_lab/s224730/Marek/shRNA_18_samples_Human_aug2023/outputs/readCounts/Deseq2_without_C1/Analysis_1.4_9.4/")
library(DESeq2)
library(edgeR)
library(pheatmap)
library(RColorBrewer)
library(ggplot2)
library(gplots)
library(gridExtra)
library("biomaRt")
library(corrplot)
library(ggrepel)
library("plotly")#, lib.loc="~/R/win-library/3.5")
library(biomaRt)
library(knitr)
library(VennDiagram)
library(ggpubr)
library(limma)
library(reshape2)
library(readxl)
library(kableExtra)
library(EnhancedVolcano)
Data <- read.csv("readCount.tsv",
header = TRUE,
sep = '\t',
skip=1)
rownames(Data) <- Data$Geneid
#View(Data)
colnames(Data) <- c("Geneid","Chr","Start","End","Strand","Length","C1_FXNSH2","C1_FXNSH3","C1_Novirus","C1_SCR","C2_FXNSH2","C2_FXNSH3", "C2_Novirus", "C2_SCR", "C3_FXNSH2", "C3_FXNSH3", "C3_Novirus", "C3_SCR", "F1_Novirus", "F1_SCR", "F2_Novirus", "F2_SCR", "F3_Novirus", "F3_SCR")
##=== countData
countData <- as.matrix(Data[,-c(1:10)])
mode(countData) <- "integer"
colData <- read.csv("info1.csv", sep=",", row.names=1)
#View(countData)
#View(colData)
GeneID_table <- Data[,c(1:6)]
#View(GeneID_table)
##===Generate a Gene Symbol to Ensembl/Entrez ID conversion table
#ensembl = useMart("ensembl",dataset="hsapiens_gene_ensembl")
#Symbol_table <- getBM(attributes=c("hgnc_symbol","ensembl_gene_id","entrezgene_id"), filters = 'hgnc_symbol', values = rownames(GeneID_table), mart = ensembl)
#write.table(Symbol_table, file="Symbol_table.txt", quote=FALSE, sep="\t")
Data preprocessing, filtering, and normalization
##===Check all sample IDs in colData are also in CountData and match their orders
all(rownames(colData) %in% colnames(countData))
## [1] TRUE
countData <- countData[, rownames(colData)]
all(rownames(colData) == colnames(countData))
## [1] TRUE
#=load a Symbol_table saved in advance
Symbol_table <- read.table("Symbol_table.txt", sep="\t")#, na.strings="")
idx_na <- which(is.na(Symbol_table$entrezgene_id))
Symbol_table <- as.matrix(Symbol_table)
Symbol_table[idx_na,2] <- Symbol_table[idx_na,1]
#=Add features oringinally not contained
idx <- which(!(rownames(countData) %in% Symbol_table[,3]))
tmp <- rownames(countData)[idx]
tmp <- cbind(tmp, tmp, tmp)
Symbol_table <- rbind(Symbol_table, tmp)
Symbol_table <- data.frame(Symbol_table)
rownames(Symbol_table) <- c(1:length(rownames(Symbol_table)))
##===Gene filtering
#countData_cpm <- t(t(countData)/colSums(countData))*1e6
#keep_g <- rowSums(countData_cpm > 1) > 5
#dim(countData[keep_g,])
#countData <- countData[keep_g,]
##===Data normalization
dds <- DESeqDataSetFromMatrix(countData = countData, colData = colData, design= ~ Groups)
#vsd <- vst(dds, blind=FALSE)
#rld <- rlog(dds, blind=FALSE); vsd<-rld
#head(assay(vsd), 3)
PART A. DESeq analysis
dds <- DESeqDataSetFromMatrix(countData = countData, colData = colData, design= ~ Groups)
dds <- DESeq(dds, test = "Wald")
# Assuming 'dds' is already defined and DESeq analysis has been performed
# ...
# Get normalized counts
#normalized_counts <- counts(dds, normalized = TRUE)
# If you want raw counts instead of normalized counts, use:
# raw_counts <- counts(dds, normalized = FALSE)
# Convert normalized counts to a data frame
#normalized_counts_df <- as.data.frame(normalized_counts)
# Save normalized counts to a CSV file
#write.csv(normalized_counts_df, file = "normalized_counts_for_gsea.csv", row.names = TRUE)
Groups: Patient vs Control, F.SCR vs C.SCR, FXNsh2 vs C.SCR, FXNsh3 vs C.SCR, FXNsh2 vs FXNsh3, FXNsh2 vs F.SCR, FXNsh3 vs F.SCR, FXNsh vs C.SCR, FXNsh vs F.SCR
#Generate an MA plot for Patient vs Control
res_Patient_vs_Control <- results(dds, contrast = c("Groups", "Patient", "Control"), pAdjustMethod = "BH")
res_Patient_vs_Control <- res_Patient_vs_Control[order(res_Patient_vs_Control$pvalue),]
summary(res_Patient_vs_Control)
##
## out of 34706 with nonzero total read count
## adjusted p-value < 0.1
## LFC > 0 (up) : 736, 2.1%
## LFC < 0 (down) : 347, 1%
## outliers [1] : 11, 0.032%
## low counts [2] : 19394, 56%
## (mean count < 7)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
Analysis 1.4) MA plot
res_Patient_vs_Control <- data.frame(res_Patient_vs_Control)
res_Patient_vs_Control$Symbol <- rownames(res_Patient_vs_Control)
df_MA1 <- res_Patient_vs_Control[,c("baseMean","log2FoldChange","padj","Symbol")]
ggmaplot(df_MA1, main = "Patient vs Control", fdr = 0.05, fc = 1.0, size = 1, legend = "top",
palette = c("green4", "red3","darkgray"), genenames = as.vector(df_MA1$Symbol), top = 20, ylim=c(-10,10),
xlab = "Log2 mean expression", ylab = "Log2 fold change", ggtheme = ggplot2::theme_classic())
## Warning: Removed 25877 rows containing missing values (`geom_point()`).
Analysis 1.4) Volcano Plot
#Volcano Plot
EnhancedVolcano(res_Patient_vs_Control, x="log2FoldChange", y="pvalue", lab=res_Patient_vs_Control$Symbol, pCutoff = 0.05, FCcutoff = 1.0)
#Generate an MA plot for F.SCR vs C.SCR
res_F.SCR_vs_C.SCR <- results(dds, contrast = c("Groups", "F.SCR", "C.SCR"), pAdjustMethod = "BH")
res_F.SCR_vs_C.SCR <- res_F.SCR_vs_C.SCR[order(res_F.SCR_vs_C.SCR$padj),]
summary(res_F.SCR_vs_C.SCR)
##
## out of 34706 with nonzero total read count
## adjusted p-value < 0.1
## LFC > 0 (up) : 96, 0.28%
## LFC < 0 (down) : 68, 0.2%
## outliers [1] : 11, 0.032%
## low counts [2] : 18101, 52%
## (mean count < 4)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
Analysis 2.4) MA Plot
res_F.SCR_vs_C.SCR <- data.frame(res_F.SCR_vs_C.SCR)
res_F.SCR_vs_C.SCR$Symbol <- rownames(res_F.SCR_vs_C.SCR)
df_MA1 <- res_F.SCR_vs_C.SCR[,c("baseMean","log2FoldChange","padj","Symbol")]
ggmaplot(df_MA1, main = "F.SCR vs C.SCR", fdr = 0.05, fc = 1.0, size = 1, legend = "top",
palette = c("green4", "red3","darkgray"), genenames = as.vector(df_MA1$Symbol), top = 20, ylim=c(-10,10),
xlab = "Log2 mean expression", ylab = "Log2 fold change", ggtheme = ggplot2::theme_classic())
## Warning: Removed 25877 rows containing missing values (`geom_point()`).
Analysis 2.4) Volcano Plot
#Volcano Plot
EnhancedVolcano(res_F.SCR_vs_C.SCR, x="log2FoldChange", y="pvalue", lab=res_F.SCR_vs_C.SCR$Symbol, pCutoff = 0.05, FCcutoff = 1.0)
#Generate an MA plot for FXNSH2 vs C.SCR
res_FXNsh2_vs_C.SCR <- results(dds, contrast = c("Groups", "FXNsh2", "C.SCR"), pAdjustMethod = "BH")
res_FXNsh2_vs_C.SCR <- res_FXNsh2_vs_C.SCR[order(res_FXNsh2_vs_C.SCR$padj),]
summary(res_FXNsh2_vs_C.SCR)
##
## out of 34706 with nonzero total read count
## adjusted p-value < 0.1
## LFC > 0 (up) : 2824, 8.1%
## LFC < 0 (down) : 3399, 9.8%
## outliers [1] : 11, 0.032%
## low counts [2] : 16808, 48%
## (mean count < 3)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
Analysis 3.4) MA Plot
res_FXNsh2_vs_C.SCR <- data.frame(res_FXNsh2_vs_C.SCR)
res_FXNsh2_vs_C.SCR$Symbol <- rownames(res_FXNsh2_vs_C.SCR)
df_MA2 <- res_FXNsh2_vs_C.SCR[,c("baseMean","log2FoldChange","padj","Symbol")]
ggmaplot(df_MA2, main = "FXNsh2 vs C.SCR", fdr = 0.05, fc = 1.0, size = 1, legend = "top",
palette = c("green4", "red3","darkgray"), genenames = as.vector(df_MA2$Symbol), top = 20, ylim=c(-10,10),
xlab = "Log2 mean expression", ylab = "Log2 fold change", ggtheme = ggplot2::theme_classic())
## Warning: Removed 25877 rows containing missing values (`geom_point()`).
Analysis 3.4) Volcano Plot
#Volcano Plot
EnhancedVolcano(res_FXNsh2_vs_C.SCR, x="log2FoldChange", y="pvalue",lab=res_FXNsh2_vs_C.SCR$Symbol, pCutoff = 0.05, FCcutoff = 1.0)
#Generate an MA plot for FXNsh3 vs C.SCR
res_FXNsh3_vs_C.SCR <- results(dds, contrast = c("Groups", "FXNsh3", "C.SCR"), pAdjustMethod = "BH")
res_FXNsh3_vs_C.SCR <- res_FXNsh3_vs_C.SCR[order(res_FXNsh3_vs_C.SCR$padj),]
summary(res_FXNsh3_vs_C.SCR)
##
## out of 34706 with nonzero total read count
## adjusted p-value < 0.1
## LFC > 0 (up) : 1372, 4%
## LFC < 0 (down) : 1612, 4.6%
## outliers [1] : 11, 0.032%
## low counts [2] : 20687, 60%
## (mean count < 13)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
Analysis 4.4) MA Plot
res_FXNsh3_vs_C.SCR <- data.frame(res_FXNsh3_vs_C.SCR)
res_FXNsh3_vs_C.SCR$Symbol <- rownames(res_FXNsh3_vs_C.SCR)
df_MA1 <- res_FXNsh3_vs_C.SCR[,c("baseMean","log2FoldChange","padj","Symbol")]
ggmaplot(df_MA1, main = "FXNsh3 vs C.SCR", fdr = 0.05, fc = 1.5, size = 1, legend = "top",
palette = c("green4", "red3","darkgray"), genenames = as.vector(df_MA1$Symbol), top = 20, ylim=c(-10,10),
xlab = "Log2 mean expression", ylab = "Log2 fold change", ggtheme = ggplot2::theme_classic())
## Warning: Removed 25877 rows containing missing values (`geom_point()`).
Analysis 4.4) Volcano Plot
#Volcano Plot
EnhancedVolcano(res_FXNsh3_vs_C.SCR, x="log2FoldChange", y="pvalue", lab=res_FXNsh3_vs_C.SCR$Symbol, pCutoff = 0.05, FCcutoff = 1.0)
Analysis 5.4) Fxnsh2 vs Fxnsh3
#Generate an MA plot for Novirus vs FXNSH2
res_FXNsh2_vs_FXNsh3 <- results(dds, contrast = c("Groups", "FXNsh2", "FXNsh3"), pAdjustMethod = "BH")
res_FXNsh2_vs_FXNsh3 <- res_FXNsh2_vs_FXNsh3[order(res_FXNsh2_vs_FXNsh3$pvalue),]
summary(res_FXNsh2_vs_FXNsh3)
##
## out of 34706 with nonzero total read count
## adjusted p-value < 0.1
## LFC > 0 (up) : 2462, 7.1%
## LFC < 0 (down) : 2871, 8.3%
## outliers [1] : 11, 0.032%
## low counts [2] : 17454, 50%
## (mean count < 4)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
Analysis 5.4) MA Plot
res_FXNsh2_vs_FXNsh3 <- data.frame(res_FXNsh2_vs_FXNsh3)
res_FXNsh2_vs_FXNsh3$Symbol <- rownames(res_FXNsh2_vs_FXNsh3)
df_MA3 <- res_FXNsh2_vs_FXNsh3[,c("baseMean","log2FoldChange","padj","Symbol")]
ggmaplot(df_MA3, main = "FXNsh2_vs_FXNsh3", fdr = 0.05, fc = 1.0, size = 1, legend = "top",
palette = c("green4", "red3","darkgray"), genenames = as.vector(df_MA3$Symbol), top = 20, ylim=c(-10,10),
xlab = "Log2 mean expression", ylab = "Log2 fold change", ggtheme = ggplot2::theme_classic())
## Warning: Removed 25877 rows containing missing values (`geom_point()`).
Analysis 5.4) Volcano Plot
#Volcano Plot
EnhancedVolcano(res_FXNsh2_vs_FXNsh3, x="log2FoldChange", y="pvalue", lab=res_FXNsh2_vs_FXNsh3$Symbol, pCutoff = 0.05, FCcutoff = 1.0)
Analysis 6.4) Fxnsh2 vs F.SCR
#Generate an MA plot for Novirus vs FXNSH2
res_FXNsh2_vs_F.SCR <- results(dds, contrast = c("Groups", "FXNsh2", "F.SCR"), pAdjustMethod = "BH")
res_FXNsh2_vs_F.SCR <- res_FXNsh2_vs_F.SCR[order(res_FXNsh2_vs_F.SCR$pvalue),]
summary(res_FXNsh2_vs_F.SCR)
##
## out of 34706 with nonzero total read count
## adjusted p-value < 0.1
## LFC > 0 (up) : 3727, 11%
## LFC < 0 (down) : 4148, 12%
## outliers [1] : 11, 0.032%
## low counts [2] : 14869, 43%
## (mean count < 2)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
Analysis 6.4) MA Plot
res_FXNsh2_vs_F.SCR <- data.frame(res_FXNsh2_vs_F.SCR)
res_FXNsh2_vs_F.SCR$Symbol <- rownames(res_FXNsh2_vs_F.SCR)
df_MA3 <- res_FXNsh2_vs_F.SCR[,c("baseMean","log2FoldChange","padj","Symbol")]
ggmaplot(df_MA3, main = "FXNsh2_vs_F.SCR", fdr = 0.05, fc = 1.0, size = 1, legend = "top",
palette = c("green4", "red3","darkgray"), genenames = as.vector(df_MA3$Symbol), top = 20, ylim=c(-10,10),
xlab = "Log2 mean expression", ylab = "Log2 fold change", ggtheme = ggplot2::theme_classic())
## Warning: Removed 25877 rows containing missing values (`geom_point()`).
Analysis 6.4) Volcano Plot
#Volcano Plot
EnhancedVolcano(res_FXNsh2_vs_F.SCR, x="log2FoldChange", y="pvalue", lab=res_FXNsh2_vs_F.SCR$Symbol, pCutoff = 0.05, FCcutoff = 1.0)
Analysis 7.4) Fxnsh3 vs F.SCR
#Generate an MA plot for Novirus vs FXNSH2
res_FXNsh3_vs_F.SCR <- results(dds, contrast = c("Groups", "FXNsh3", "F.SCR"), pAdjustMethod = "BH")
res_FXNsh3_vs_F.SCR <- res_FXNsh2_vs_FXNsh3[order(res_FXNsh3_vs_F.SCR$pvalue),]
summary(res_FXNsh3_vs_F.SCR)
## baseMean log2FoldChange lfcSE stat
## Min. : 0.0 Min. :-7.660 Min. :0.075 Min. :-20.129
## 1st Qu.: 0.0 1st Qu.:-0.558 1st Qu.:0.337 1st Qu.: -0.452
## Median : 0.1 Median : 0.000 Median :1.483 Median : 0.000
## Mean : 179.0 Mean : 0.021 Mean :2.531 Mean : -0.043
## 3rd Qu.: 7.9 3rd Qu.: 0.684 3rd Qu.:5.683 3rd Qu.: 0.518
## Max. :378100.3 Max. : 7.794 Max. :5.771 Max. : 14.965
## NA's :25877 NA's :25877 NA's :25877
## pvalue padj Symbol
## Min. :0.000 Min. :0.00 Length:60583
## 1st Qu.:0.157 1st Qu.:0.05 Class :character
## Median :0.631 Median :0.35 Mode :character
## Mean :0.542 Mean :0.40
## 3rd Qu.:0.842 3rd Qu.:0.73
## Max. :1.000 Max. :1.00
## NA's :25888 NA's :43342
Analysis 7.4) MA Plot
res_FXNsh3_vs_F.SCR <- data.frame(res_FXNsh3_vs_F.SCR)
res_FXNsh3_vs_F.SCR$Symbol <- rownames(res_FXNsh3_vs_F.SCR)
df_MA3 <- res_FXNsh3_vs_F.SCR[,c("baseMean","log2FoldChange","padj","Symbol")]
ggmaplot(df_MA3, main = "FXNsh3_vs_F.SCR", fdr = 0.05, fc = 1.0, size = 1, legend = "top",
palette = c("green4", "red3","darkgray"), genenames = as.vector(df_MA3$Symbol), top = 20, ylim=c(-10,10),
xlab = "Log2 mean expression", ylab = "Log2 fold change", ggtheme = ggplot2::theme_classic())
## Warning: Removed 25877 rows containing missing values (`geom_point()`).
Analysis 7.4) Volcano Plot
#Volcano Plot
EnhancedVolcano(res_FXNsh3_vs_F.SCR, x="log2FoldChange", y="pvalue", lab=res_FXNsh3_vs_F.SCR$Symbol, pCutoff = 0.05, FCcutoff = 1.0)
DESeq Analyses
dds <- DESeqDataSetFromMatrix(countData = countData, colData = colData, design= ~ Group1)
dds <- DESeq(dds, test = "Wald")
Analysis 8.4) FXNsh vs F.SCR
#Generate an MA plot for Novirus vs FXNSH2
res_FXNsh_vs_F.SCR <- results(dds, contrast = c("Group1", "FXNsh", "F.SCR"), pAdjustMethod = "BH")
res_FXNsh_vs_F.SCR <- res_FXNsh_vs_F.SCR[order(res_FXNsh_vs_F.SCR$pvalue),]
summary(res_FXNsh_vs_F.SCR)
##
## out of 34706 with nonzero total read count
## adjusted p-value < 0.1
## LFC > 0 (up) : 2823, 8.1%
## LFC < 0 (down) : 2859, 8.2%
## outliers [1] : 32, 0.092%
## low counts [2] : 15515, 45%
## (mean count < 2)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
Analysis 8.4) MA Plot
res_FXNsh_vs_F.SCR <- data.frame(res_FXNsh_vs_F.SCR)
res_FXNsh_vs_F.SCR$Symbol <- rownames(res_FXNsh_vs_F.SCR)
df_MA3 <- res_FXNsh_vs_F.SCR[,c("baseMean","log2FoldChange","padj","Symbol")]
ggmaplot(df_MA3, main = "FXNsh vs F.SCR", fdr = 0.05, fc = 1.0, size = 1, legend = "top",
palette = c("green4", "red3","darkgray"), genenames = as.vector(df_MA3$Symbol), top = 20, ylim=c(-10,10),
xlab = "Log2 mean expression", ylab = "Log2 fold change", ggtheme = ggplot2::theme_classic())
## Warning: Removed 25877 rows containing missing values (`geom_point()`).
Analysis 8.4) Volcano Plot
#Volcano Plot
EnhancedVolcano(res_FXNsh_vs_F.SCR, x="log2FoldChange", y="pvalue", lab=res_FXNsh_vs_F.SCR$Symbol, pCutoff = 0.05, FCcutoff = 1.0)
Analysis 9.4) FXNsh vs C.SCR
#Generate an MA plot for SCR vs Novirus
res_FXNsh_vs_C.SCR <- results(dds, contrast = c("Group1", "FXNsh", "C.SCR"), pAdjustMethod = "BH")
res_FXNsh_vs_C.SCR <- res_FXNsh_vs_C.SCR[order(res_FXNsh_vs_C.SCR$pvalue),]
summary(res_FXNsh_vs_C.SCR)
##
## out of 34706 with nonzero total read count
## adjusted p-value < 0.1
## LFC > 0 (up) : 1433, 4.1%
## LFC < 0 (down) : 1524, 4.4%
## outliers [1] : 32, 0.092%
## low counts [2] : 19391, 56%
## (mean count < 7)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
Analysis 9.4) MA Plot
res_FXNsh_vs_C.SCR <- data.frame(res_FXNsh_vs_C.SCR)
res_FXNsh_vs_C.SCR$Symbol <- rownames(res_FXNsh_vs_C.SCR)
df_MA4 <- res_FXNsh_vs_C.SCR[,c("baseMean","log2FoldChange","padj","Symbol")]
ggmaplot(df_MA4, main = "FXNsh vs C.SCR", fdr = 0.05, fc = 1.0, size = 1, legend = "top",
palette = c("green4", "red3","darkgray"), genenames = as.vector(df_MA4$Symbol), top = 20, ylim=c(-10,10),
xlab = "Log2 mean expression", ylab = "Log2 fold change", ggtheme = ggplot2::theme_classic())
## Warning: Removed 25877 rows containing missing values (`geom_point()`).
Analysis 9.4) Volcano PLot
#Volcano Plot
EnhancedVolcano(res_FXNsh_vs_C.SCR, x="log2FoldChange", y="pvalue", lab=res_FXNsh_vs_C.SCR$Symbol, pCutoff = 0.05, FCcutoff = 1.0)